1 Introduction

It looks like Raustats might not be what I am looking for to get SLA level census data quickly. Let"s try Census2016 from Hugh Parsonage.

full_2016_census <- Census2016_wide_by_SA2_year %>%
  filter(year == "2016")
head(full_2016_census)

Yes - this is what I need.

Loading other tables

ancestories_2016 <- Census2016_ancestories %>%
  filter(year == "2016")
countries_of_birth_2016 <- Census2016_countries_of_birth %>%
  filter(year == "2016")
languages_2016 <- Census2016_languages %>%
  filter(year == "2016")
religions_2016 <- Census2016_religions %>%
  filter(year == "2016")

2 Feature Engineering

Each of the four variables ancestory, country of birth, languages and religions are quite granular, and it may make sense to look at these variables at a lower level of granularity.

Country of birth

if (!file.exists("./Data/country_classification.xls")) {
  download.file("https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&sacc_12690do0001_201903.xls&1269.0&Data%20Cubes&480BD730AF42D515CA2583BD007707C5&0&2016&15.03.2019&Latest", "./Data/country_classification.xls", method = "libcurl")
}
country_classification_4dig <-
  readxl::read_xls("./Data/country_classification.xls",
                   sheet = "Table 1.3", skip = 7,
                   col_names = c("X1", "X2", "Country_Code_4", "Country")) %>%
  filter(!is.na(Country)) %>%
  dplyr::select(-X1, -X2)

country_classification_2dig <-
  readxl::read_xls("./Data/country_classification.xls",
                   sheet = "Table 1.2", skip = 6,
                   col_names = c("X1", "Country_Code_2", "Country_Name_2")) %>%
  filter(!is.na(Country_Name_2)) %>%
  dplyr::select(-X1)

country_classification_1dig <-
  readxl::read_xls("./Data/country_classification.xls",
                   sheet = "Table 1.1", skip = 5,
                   col_names = c("Country_Code_1", "Country_Group")) %>%
  filter(!is.na(Country_Group))

Language

if (!file.exists("./Data/language_classification.xls")) {
  download.file("https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&ASCL_12670DO0001_201703.xls&1267.0&Data%20Cubes&F84620CF6E13F7E8CA257FF1001E68A7&0&2016&28.03.2017&Latest", "./Data/language_classification.xls", method = "libcurl")
}



language_classification_4dig <-
  readxl::read_xls("./Data/language_classification.xls",
                   sheet = "Table 1.3", skip = 8,
                   col_names =
                     c("X1", "X2", "X3", "X4",
                       "Language_Code_3", "Language")) %>%
  filter(!is.na(Language)) %>%
  dplyr::select(-X1, -X2, -X3, -X4)

language_classification_1dig <-
  readxl::read_xls("./Data/language_classification.xls",
                   sheet = "Table 1.1", skip = 5,
                   col_names = c("Language_Code_1", "Language_Group")) %>%
  filter(!is.na(Language_Group))

Religion

if (!file.exists("./Data/religion_classification.xls")) {
  download.file("https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&ASCRG_12660DO0001_201707.xls&1266.0&Data%20Cubes&B3EAFE3FE6180D37CA257FF1001E673C&0&2016&14.07.2017&Latest", "./Data/religion_classification.xls", method = "libcurl")
}


religion_classification_3dig <-
  readxl::read_xls("./Data/religion_classification.xls",
                   sheet = "Table 1.2", skip = 6,
                   col_names = c("X1", "Religion_Code_3", "Religion")) %>%
  filter(!is.na(Religion)) %>%
  dplyr::select(-X1)

religion_classification_1dig <-
  readxl::read_xls("./Data/religion_classification.xls",
                   sheet = "Table 1.1", skip = 5,
                   col_names = c("Religion_Code_1", "Religion_Group")) %>%
  filter(!is.na(Religion_Group))

3 Combine with election data

Aggregate at the SA2 level - add unless variable contains median, average, persons_per_bedroom

census_2016_all_vars <- Census2016_wide_by_SA2_year %>%
  filter(year == "2016") %>%
  rowwise() %>%
  mutate(sa2_id = paste0(substr(sa2_code, 1, 1), substr(sa2_code, 6, 9))) %>%
  filter(isMissing == FALSE) %>%
  mutate(percent_female = female / persons,
         percent_defacto = defacto_persons / persons,
         percent_married = married_persons / persons,
         percent_indig = indig_persons / persons,
         percent_born_in_australia = born_in_australia / persons,
         percent_unit = flat_or_unit / n_dwellings,
         percent_mortgage = dwelling_owned_mortgage / n_dwellings,
         percent_rent = dwelling_rented / n_dwellings)


census_2016_means <- census_2016_all_vars %>%
  dplyr::select(median_age, median_household_income, average_household_size,
                persons_per_bedroom, median_weekly_rent,
                median_annual_mortgage, sa2_id) %>%
  group_by(sa2_id) %>%
  summarise_all(mean, na.rm = TRUE)

census_2016_counts <- census_2016_all_vars %>%
  dplyr::select(n_dwellings, persons, female, male,
                married_persons, married_females, married_males,
                defacto_persons, defacto_females, defacto_males,
                notmarried_persons, notmarried_females, notmarried_males,
                indig_persons, indig_males, indig_females, non_indig_persons,
                non_indig_females, non_indig_males, not_stated_indig_persons,
                not_stated_indig_males, not_stated_indig_females,
                born_in_australia, born_overseas, country_not_stated,
                separate_house, flat_or_unit,
                housing_other_or_not_stated, semi_or_townhouse,
                dwelling_owned_outright, dwelling_owned_mortgage,
                dwelling_other_or_not_stated, dwelling_rented, sa2_id) %>%
  group_by(sa2_id) %>%
  summarise_all(sum, na.rm = TRUE) %>%
  mutate(percent_female = female / persons,
         percent_defacto = defacto_persons / persons,
         percent_married = married_persons / persons,
         percent_indig = indig_persons / persons,
         percent_born_in_australia = born_in_australia / persons,
         percent_unit = flat_or_unit / n_dwellings,
         percent_mortgage = dwelling_owned_mortgage / n_dwellings,
         percent_rent = dwelling_rented / n_dwellings)

So what I need is weighted demographic data for each of the polling places based on the number of people from each SLA2 who voted at the polling place. Since we don’t know who voted where, and who can vote at all, we are making the naive assumptions that * Voters at each SLA are similar * Voters are representative of census respondents at the SLA2 level.

Download and load polling places by SA1

if (!file.exists("./Data/polling-place-by-sa1s-2016.xlsx")) {
  download.file("https://www.aec.gov.au/Elections/Federal_Elections/2016/files/polling-place-by-sa1s-2016.xlsx",
                "./Data/polling-place-by-sa1s-2016.xlsx", method = "libcurl")
}

polling_place_data <-
  readxl::read_xlsx("./Data/polling-place-by-sa1s-2016.xlsx")

Aggregate polling place data to SA2

polling_place_sa2 <- polling_place_data %>%
  mutate(sa2_id = floor(SA1_id / 100)) %>%
  group_by(year, state_ab, div_nm, pp_id, pp_nm, sa2_id) %>%
  summarise(votes = sum(votes))

Combine with demographic data and aggregate

polling_place_demog <- polling_place_sa2 %>%
  mutate(sa2_id = as.character(sa2_id)) %>%
  inner_join(census_2016_all_vars)

polling_place_demog_means <- polling_place_demog %>%
  dplyr::select(year, state_ab, div_nm, pp_id, pp_nm, sa2_id, votes,
                median_age, median_household_income, average_household_size,
                persons_per_bedroom, median_weekly_rent,
                median_annual_mortgage, percent_female, percent_defacto,
                percent_married, percent_indig, percent_born_in_australia,
                percent_unit, percent_mortgage, percent_rent) %>%
  group_by(year, state_ab, div_nm, pp_id, pp_nm) %>%
  summarise_at(vars(median_age, median_household_income,
                    average_household_size, persons_per_bedroom,
                    median_weekly_rent, median_annual_mortgage, percent_female,
                    percent_defacto, percent_married, percent_indig,
                    percent_born_in_australia, percent_unit, percent_mortgage,
                    percent_rent), funs(weighted.mean(., w = votes)))

Add in 2pp at the polling booth level

election_2pp <- twoparty_pollingbooth_download()
polling_place_2pp <- polling_place_demog_means %>%
  group_by() %>%
  rename(StateAb = state_ab,
         DivisionNm = div_nm,
         PollingPlace = pp_nm,
         PollingPlaceID = pp_id) %>%
  mutate(DivisionNm = toupper(DivisionNm),
         PollingPlace = toupper(PollingPlace)) %>%
  left_join(election_2pp %>%
              filter(year == 2016))

Check for missing data

polling_place_2pp %>%
  summarise_all(funs(sum(is.na(.))))

Which booths are null?

polling_place_2pp %>%
  filter(is.na(TotalVotes)) %>%
  tabyl(PollingPlace)

So the Absent, Postals, Pre-Poll and Provisional votes aren’t in this table. Let"s come back to those…

polling_place_2pp %>%
  summarise_all(funs(sum(is.null(.))))

Remove the rows with NAs

polling_place_2pp_clean <- polling_place_2pp %>%
  filter(!is.na(TotalVotes))
polling_place_2pp_clean %>%
  summarise_all(funs(sum(is.na(.))))

Which polling stations have missing data? Not too concerned about post code, as there are some special booths

polling_place_2pp_clean %>%
  filter(is.na(median_age) | is.na(Latitude))

Looks like mobile teams and prepoll centres, and only latitude and longitude. Will remove the Brand mobile team, as the demographic data does not look valid.

polling_place_2pp_clean <- polling_place_2pp_clean %>%
  dplyr::filter(PollingPlaceID != 65161)

Look at variable summaries

skimr::skim_tee(polling_place_2pp_clean)
## -- Data Summary ------------------------
##                            Values
## Name                       data  
## Number of rows             8167  
## Number of columns          29    
## _______________________          
## Column type frequency:           
##   character                3     
##   numeric                  26    
## ________________________         
## Group variables            None  
## 
## -- Variable type: character ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## # A tibble: 3 x 8
##   skim_variable n_missing complete_rate   min   max empty n_unique whitespace
## * <chr>             <int>         <dbl> <int> <int> <int>    <int>      <int>
## 1 StateAb               0             1     2     3     0        8          0
## 2 DivisionNm            0             1     4    15     0      150          0
## 3 PollingPlace          0             1     3    38     0     7308          0
## 
## -- Variable type: numeric ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## # A tibble: 26 x 11
##    skim_variable             n_missing complete_rate       mean         sd           p0         p25        p50        p75       p100 hist 
##  * <chr>                         <int>         <dbl>      <dbl>      <dbl>        <dbl>       <dbl>      <dbl>      <dbl>      <dbl> <chr>
##  1 year                              0         1      2016          0       2016         2016        2016       2016        2016     <U+2581><U+2581><U+2587><U+2581><U+2581>
##  2 PollingPlaceID                    0         1     16942.     21513.         1         3112.       6453      31664.      82815     <U+2587><U+2581><U+2581><U+2581><U+2581>
##  3 median_age                        0         1        40.3        5.47      22.9         36.3        40.0       43.9        61.8   <U+2581><U+2586><U+2587><U+2582><U+2581>
##  4 median_household_income           0         1     74568.     20909.     39294.       58443.      70395.     87600.     154668.    <U+2587><U+2587><U+2585><U+2582><U+2581>
##  5 average_household_size            0         1         2.57       0.304      1.63         2.36        2.51       2.75        5.32  <U+2583><U+2587><U+2581><U+2581><U+2581>
##  6 persons_per_bedroom               0         1         0.832      0.105      0.690        0.796       0.8        0.884       1.98  <U+2587><U+2581><U+2581><U+2581><U+2581>
##  7 median_weekly_rent                0         1       316.       110.        16.2        240         319.       382.        786.    <U+2582><U+2587><U+2587><U+2581><U+2581>
##  8 median_annual_mortgage            0         1     20753.      5631.       266.       16468.      20771.     24121.      41242.    <U+2581><U+2583><U+2587><U+2583><U+2581>
##  9 percent_female                    0         1         0.504      0.0156     0.315        0.497       0.507      0.514       0.541 <U+2581><U+2581><U+2581><U+2582><U+2587>
## 10 percent_defacto                   0         1         0.0761     0.0218     0.0202       0.0652      0.0767     0.0863      0.249 <U+2583><U+2587><U+2581><U+2581><U+2581>
## 11 percent_married                   0         1         0.346      0.0502     0.0598       0.320       0.352      0.378       0.507 <U+2581><U+2581><U+2583><U+2587><U+2581>
## 12 percent_indig                     0         1         0.0339     0.0638     0.000863     0.00816     0.0186     0.0395      0.954 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 13 percent_born_in_australia         0         1         0.707      0.125      0.192        0.622       0.745      0.807       0.987 <U+2581><U+2581><U+2583><U+2587><U+2582>
## 14 percent_unit                     52         0.994     0.0869     0.137      0            0.0107      0.0302     0.0935      0.802 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 15 percent_mortgage                 52         0.994     0.319      0.0873     0.00238      0.262       0.307      0.368       0.687 <U+2581><U+2585><U+2587><U+2582><U+2581>
## 16 percent_rent                     52         0.994     0.268      0.0990     0.0498       0.200       0.256      0.320       0.891 <U+2585><U+2587><U+2582><U+2581><U+2581>
## 17 LNP_Votes                         0         1       683.       845.         0          202         498        881       10094     <U+2587><U+2581><U+2581><U+2581><U+2581>
## 18 LNP_Percent                       0         1        52.1       15.7        0           41.1        52.8       63.1       100     <U+2581><U+2583><U+2587><U+2585><U+2581>
## 19 ALP_Votes                         0         1       685.       799.         0          145         523        956.       9481     <U+2587><U+2581><U+2581><U+2581><U+2581>
## 20 ALP_Percent                       0         1        47.8       15.7        0           36.9        47.2       58.8       100     <U+2581><U+2585><U+2587><U+2583><U+2581>
## 21 TotalVotes                        0         1      1368.      1549.         0          366.       1133       1840       16396     <U+2587><U+2581><U+2581><U+2581><U+2581>
## 22 Swing                             0         1        -2.90       7.61     -65.7         -6.11       -2.88       0.14       85.7   <U+2581><U+2583><U+2587><U+2581><U+2581>
## 23 Latitude                         33         0.996   -32.7        6.80     -43.4        -36.7       -33.9      -30.8        57.9   <U+2587><U+2581><U+2581><U+2581><U+2581>
## 24 Longitude                        33         0.996   144.        15.2      -95.7        145.        147.       151.        168.    <U+2581><U+2581><U+2581><U+2581><U+2587>
## 25 DivisionID                        0         1       187.        55.2      101          139         183        225         317     <U+2587><U+2587><U+2587><U+2585><U+2582>
## 26 PremisesPostCode                388         0.952  3784.      1456.       800         2539        3377       4742.       7470     <U+2581><U+2587><U+2585><U+2583><U+2582>

4 Visualising Basic Statistics

polling_place_2pp_clean %>%
  ggplot(aes(x = LNP_Percent / 100)) +
  stat_density(geom = "line", colour = "blue") +
  theme_classic(base_size = 16) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: LNP 2 Party Preferred Percentage",
       x = "2PP Percentage", y = "Density",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(x = ALP_Percent / 100)) +
  stat_density(geom = "line", colour = "red") +
  theme_classic(base_size = 16) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "2PP Percentage", y = "Density",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(x = Swing / 100)) +
  stat_density(geom = "line", colour = "purple") +
  theme_classic(base_size = 16) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: Swing to Incumbent", x = "Swing",
       y = "Density", subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(x = median_household_income)) +
  stat_density(geom = "line", colour = "purple") +
  theme_classic(base_size = 16) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Census: Median Income", x = "Median Income",
       y = "Density", subtitle = "by Polling Booth, Unweighted")

4.1 State Breakdowns

Can we look at these distributions by state?

polling_place_2pp_clean %>%
  ggplot(aes(x = ALP_Percent / 100, colour = StateAb)) +
  stat_density(geom = "line", position = "dodge") +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "2PP Percentage", y = "Frequency",
       subtitle = "by Polling Booth, Unweighted")

What about by median income

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = StateAb)) +
  geom_point() +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "Booth Median Income",
       y = "ALP 2pp Percentage", subtitle = "by Polling Booth, Unweighted") +
  facet_wrap(~StateAb, nrow = 4)

What about comparing NSW electorates? There seems to be an odd separation in income bands for low ALP 2pp. Could this be a regional vs city difference?

fp_booth_16 <- firstpref_pollingbooth_download() %>%
  filter(year == 2016)
polling_2cp <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseTcpByCandidateByPollingPlaceDownload-20499.csv", skip = 1)

polling_2pp <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseTppByPollingPlaceDownload-20499.csv", skip = 1)

fp_booth_2016 <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-NSW.csv", skip = 1) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-VIC.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-QLD.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-SA.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-WA.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-TAS.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-NT.csv", skip = 1)) %>%
  rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-ACT.csv", skip = 1))
coalition_contest_2016 <- fp_booth_2016 %>%
  group_by(DivisionNm, PartyNm, HistoricElected) %>%
  summarise(OrdinaryVotes = sum(OrdinaryVotes)) %>%
  filter(PartyNm %in% c("Liberal", "Country Liberals (NT)",
                        "Liberal National Party of Queensland",
                        "The Nationals")) %>%
  group_by(DivisionNm) %>%
  top_n(1) %>%
  dplyr::select(DivisionNm, PartyNm)

If we look at a couple of the states where high income booths tend to vote strongly for the coalition as well as lower income booths, we can see that some (but not all) of the lower income booths are contested by The Nationals. This indicares (not surprisingly) that Nationals voters and Liberal voters are different socio-economically, or possibly that city and country coalition voters differ.

polling_place_2pp_clean %>%
  mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
  inner_join(coalition_contest_2016) %>%
  filter(StateAb == "NSW") %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = PartyNm)) +
  geom_point(size = 3) +
  theme_classic(base_size = 16) +
  scale_color_manual(values = c("blue", "dark green")) +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "Booth Median Income",
       y = "ALP 2pp Percentage", colour = "Coalition Party",
       subtitle = "by Polling Booth, Unweighted (NSW)")

polling_place_2pp_clean %>%
  mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
  inner_join(coalition_contest_2016) %>%
  filter(StateAb == "VIC") %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = PartyNm)) +
  geom_point(size = 3) +
  theme_classic(base_size = 16) +
  scale_color_manual(values = c("blue", "dark green")) +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "Booth Median Income",
       y = "ALP 2pp Percentage", colour = "Coalition Party",
       subtitle = "by Polling Booth, Unweighted (VIC)")

This effect is less clear in states where the Nationals aren’t as prominent, either because the Nationals aren’t as prominent (SA, WA, TAS), or are merged with the Liberals (QLD).

polling_place_2pp_clean %>%
  mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
  inner_join(coalition_contest_2016) %>%
  filter(StateAb == "WA") %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = PartyNm)) +
  geom_point(size = 3) +
  theme_classic(base_size = 16) +
  scale_color_manual(values = c("blue", "dark green")) +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       x = "Booth Median Income",
       y = "ALP 2pp Percentage", colour = "Coalition Party",
       subtitle = "by Polling Booth, Unweighted (WA)")

Perhaps we would be better off using the geographical classifications from the AEC.

webpage <-
  read_html("http://results.aec.gov.au/20499/Website/HouseDivisionClassifications-20499-NAT.htm")

division_classifications <- webpage %>%
  html_nodes("#divisionClassifications") %>%
  html_table(fill = TRUE) %>%
  .[[1]]

division_classifications <- division_classifications %>%
  filter(Division != "Total Enrolment")

polling_place_2pp_clean <- polling_place_2pp_clean %>%
  mutate(Division = stringr::str_to_title(DivisionNm)) %>%
  inner_join(division_classifications)

The graph below shows that the booths that have a low ALP 2pp and a low median income are primarily rural booths. This relationship seems stronger than the Lib/Nat split.

polling_place_2pp_clean %>%
  filter(StateAb == "NSW") %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Booth Median Income", colour = "Region",
       subtitle = "by Polling Booth, Unweighted (NSW)")

Looking at all states we see a similar relationship, although less strong than in NSW.

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Median Booth Income", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

What about some of the other variables?

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = average_household_size,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Average Household Size", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_female, colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent Female", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  dplyr::select(ALP_Percent, Swing, median_age, median_household_income,
                average_household_size, persons_per_bedroom, median_weekly_rent,
                median_annual_mortgage, percent_female, percent_defacto,
                percent_born_in_australia,
                percent_unit, percent_mortgage, percent_rent) %>%
  cor %>%
  kable()
ALP_Percent Swing median_age median_household_income average_household_size persons_per_bedroom median_weekly_rent median_annual_mortgage percent_female percent_defacto percent_born_in_australia percent_unit percent_mortgage percent_rent
ALP_Percent 1.0000000 -0.2912343 -0.4107650 -0.0056240 0.1653612 0.3613836 0.1525497 0.1068924 0.1182998 0.1417798 -0.3283471 NA NA NA
Swing -0.2912343 1.0000000 0.0514799 0.1105950 -0.0781497 0.0236802 0.0899012 0.1048528 0.0807160 -0.0001612 -0.0461574 NA NA NA
median_age -0.4107650 0.0514799 1.0000000 -0.4776289 -0.5107579 -0.5931916 -0.4041044 -0.4171832 -0.0537144 -0.0620614 0.5213940 NA NA NA
median_household_income -0.0056240 0.1105950 -0.4776289 1.0000000 0.4138939 0.3353107 0.8079796 0.8679774 0.2015761 -0.0669486 -0.4025008 NA NA NA
average_household_size 0.1653612 -0.0781497 -0.5107579 0.4138939 1.0000000 0.3779623 0.3332586 0.3183454 -0.0068105 -0.5133083 -0.2969940 NA NA NA
persons_per_bedroom 0.3613836 0.0236802 -0.5931916 0.3353107 0.3779623 1.0000000 0.4035784 0.3926183 -0.0036236 0.0290107 -0.5889513 NA NA NA
median_weekly_rent 0.1525497 0.0899012 -0.4041044 0.8079796 0.3332586 0.4035784 1.0000000 0.9280559 0.4262035 -0.1258483 -0.6024308 NA NA NA
median_annual_mortgage 0.1068924 0.1048528 -0.4171832 0.8679774 0.3183454 0.3926183 0.9280559 1.0000000 0.3597541 -0.1104388 -0.5512846 NA NA NA
percent_female 0.1182998 0.0807160 -0.0537144 0.2015761 -0.0068105 -0.0036236 0.4262035 0.3597541 1.0000000 -0.0999105 -0.1133221 NA NA NA
percent_defacto 0.1417798 -0.0001612 -0.0620614 -0.0669486 -0.5133083 0.0290107 -0.1258483 -0.1104388 -0.0999105 1.0000000 0.2271308 NA NA NA
percent_born_in_australia -0.3283471 -0.0461574 0.5213940 -0.4025008 -0.2969940 -0.5889513 -0.6024308 -0.5512846 -0.1133221 0.2271308 1.0000000 NA NA NA
percent_unit NA NA NA NA NA NA NA NA NA NA NA 1 NA NA
percent_mortgage NA NA NA NA NA NA NA NA NA NA NA NA 1 NA
percent_rent NA NA NA NA NA NA NA NA NA NA NA NA NA 1

Note that there appear to be lots of NAs for the housing data. Need to fix/impute.

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = persons_per_bedroom,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Persons per Bedroom", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_unit, colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent of Dwellings - Unit", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_mortgage,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent of Dwellings - Under Mortgage", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_rent, colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent of Dwellings - Renting", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_indig, colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent Indigeneous", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_born_in_australia,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent Born in Australia", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

polling_place_2pp_clean %>%
  ggplot(aes(y = ALP_Percent / 100, x = percent_defacto,
             colour = Demographic)) +
  geom_point(size = 1) +
  theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
       y = "ALP 2pp Percentage",
       x = "Percent in a Defacto Relationship", colour = "Region",
       subtitle = "by Polling Booth, Unweighted")

5 What to do next?

Add extra variables Look at lagged results Build models

6 Simple Model Building

6.1 Linear Models

Can we build a simple linear model to predict 2pp

alp_2pp_lm_demog <-
  lm(ALP_Percent ~ median_household_income * Demographic + percent_indig +
       percent_female + percent_defacto + percent_born_in_australia +
       percent_rent + median_weekly_rent + median_age,
     data = polling_place_2pp_clean)

summary(alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income * Demographic + 
##     percent_indig + percent_female + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.536  -8.043  -0.237   7.809  73.017 
## 
## Coefficients:
##                                                            Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)                                           109.644469038   5.585384199  19.631 < 0.0000000000000002 ***
## median_household_income                                -0.000517611   0.000017675 -29.284 < 0.0000000000000002 ***
## DemographicOuter Metropolitan                          -1.583368327   1.865918048  -0.849              0.39614    
## DemographicProvincial                                 -10.724897471   2.380920835  -4.505        0.00000674955 ***
## DemographicRural                                      -36.155148550   1.977005482 -18.288 < 0.0000000000000002 ***
## percent_indig                                          16.501075768   2.761240592   5.976        0.00000000239 ***
## percent_female                                         16.762816547  11.349092093   1.477              0.13971    
## percent_defacto                                       151.352869834   7.954650274  19.027 < 0.0000000000000002 ***
## percent_born_in_australia                              -9.365438012   2.328767479  -4.022        0.00005834398 ***
## percent_rent                                          -12.770771154   2.509964253  -5.088        0.00000037027 ***
## median_weekly_rent                                      0.032011772   0.003118428  10.265 < 0.0000000000000002 ***
## median_age                                             -0.890392784   0.046220782 -19.264 < 0.0000000000000002 ***
## median_household_income:DemographicOuter Metropolitan  -0.000003321   0.000021314  -0.156              0.87619    
## median_household_income:DemographicProvincial           0.000083316   0.000030512   2.731              0.00634 ** 
## median_household_income:DemographicRural                0.000309088   0.000024982  12.372 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.15 on 7775 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3995, Adjusted R-squared:  0.3984 
## F-statistic: 369.5 on 14 and 7775 DF,  p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)

vif(alp_2pp_lm_demog)
##                                             GVIF Df GVIF^(1/(2*Df))
## median_household_income                 7.316624  1        2.704926
## Demographic                         18261.455172  3        5.131637
## percent_indig                           1.706153  1        1.306198
## percent_female                          1.650000  1        1.284523
## percent_defacto                         1.614552  1        1.270650
## percent_born_in_australia               4.563213  1        2.136168
## percent_rent                            3.299426  1        1.816432
## median_weekly_rent                      6.269462  1        2.503889
## median_age                              3.407630  1        1.845977
## median_household_income:Demographic 13002.761451  3        4.849228
gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income * Demographic + 
##     percent_indig + percent_female + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.536  -8.043  -0.237   7.809  73.017 
## 
## Coefficients:
##                                                            Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)                                           109.644469038   5.585384199  19.631 < 0.0000000000000002 ***
## median_household_income                                -0.000517611   0.000017675 -29.284 < 0.0000000000000002 ***
## DemographicOuter Metropolitan                          -1.583368327   1.865918048  -0.849              0.39614    
## DemographicProvincial                                 -10.724897471   2.380920835  -4.505        0.00000674955 ***
## DemographicRural                                      -36.155148550   1.977005482 -18.288 < 0.0000000000000002 ***
## percent_indig                                          16.501075768   2.761240592   5.976        0.00000000239 ***
## percent_female                                         16.762816547  11.349092093   1.477              0.13971    
## percent_defacto                                       151.352869834   7.954650274  19.027 < 0.0000000000000002 ***
## percent_born_in_australia                              -9.365438012   2.328767479  -4.022        0.00005834398 ***
## percent_rent                                          -12.770771154   2.509964253  -5.088        0.00000037027 ***
## median_weekly_rent                                      0.032011772   0.003118428  10.265 < 0.0000000000000002 ***
## median_age                                             -0.890392784   0.046220782 -19.264 < 0.0000000000000002 ***
## median_household_income:DemographicOuter Metropolitan  -0.000003321   0.000021314  -0.156              0.87619    
## median_household_income:DemographicProvincial           0.000083316   0.000030512   2.731              0.00634 ** 
## median_household_income:DemographicRural                0.000309088   0.000024982  12.372 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.15 on 7775 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3995, Adjusted R-squared:  0.3984 
## F-statistic: 369.5 on 14 and 7775 DF,  p-value: < 0.00000000000000022
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = alp_2pp_lm_demog) 
## 
##                      Value      p-value                   Decision
## Global Stat        329.183 0.0000000000 Assumptions NOT satisfied!
## Skewness            24.285 0.0000008309 Assumptions NOT satisfied!
## Kurtosis           271.406 0.0000000000 Assumptions NOT satisfied!
## Link Function        9.194 0.0024278954 Assumptions NOT satisfied!
## Heteroscedasticity  24.298 0.0000008254 Assumptions NOT satisfied!
alp_2pp_lm_demog <-
  lm(ALP_Percent ~ median_household_income + Demographic + percent_indig +
       percent_defacto + percent_born_in_australia +
       percent_rent + median_weekly_rent + median_age,
     data = polling_place_2pp_clean)

summary(alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic + 
##     percent_indig + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.420  -8.135  -0.258   8.075  82.895 
## 
## Coefficients:
##                                   Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)                   123.52168682   2.62448183  47.065 < 0.0000000000000002 ***
## median_household_income        -0.00044876   0.00001369 -32.780 < 0.0000000000000002 ***
## DemographicOuter Metropolitan  -1.56733045   0.46215084  -3.391             0.000699 ***
## DemographicProvincial          -2.69135517   0.62742024  -4.290       0.000018119492 ***
## DemographicRural              -14.17052058   0.60808623 -23.303 < 0.0000000000000002 ***
## percent_indig                  12.91963465   2.76941622   4.665       0.000003136028 ***
## percent_defacto               152.44664157   7.82591302  19.480 < 0.0000000000000002 ***
## percent_born_in_australia     -11.21826628   2.14098299  -5.240       0.000000164993 ***
## percent_rent                  -15.79729218   2.43979124  -6.475       0.000000000101 ***
## median_weekly_rent              0.02988500   0.00276810  10.796 < 0.0000000000000002 ***
## median_age                     -1.12325830   0.04174571 -26.907 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.3 on 7779 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3846, Adjusted R-squared:  0.3838 
## F-statistic: 486.2 on 10 and 7779 DF,  p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)

vif(alp_2pp_lm_demog)
##                               GVIF Df GVIF^(1/(2*Df))
## median_household_income   4.285242  1        2.070083
## Demographic               4.657881  3        1.292304
## percent_indig             1.675652  1        1.294470
## percent_defacto           1.525730  1        1.235204
## percent_born_in_australia 3.765674  1        1.940534
## percent_rent              3.043732  1        1.744629
## median_weekly_rent        4.823036  1        2.196141
## median_age                2.713934  1        1.647402
crPlots(alp_2pp_lm_demog)

ceresPlots(alp_2pp_lm_demog)

gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic + 
##     percent_indig + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.420  -8.135  -0.258   8.075  82.895 
## 
## Coefficients:
##                                   Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)                   123.52168682   2.62448183  47.065 < 0.0000000000000002 ***
## median_household_income        -0.00044876   0.00001369 -32.780 < 0.0000000000000002 ***
## DemographicOuter Metropolitan  -1.56733045   0.46215084  -3.391             0.000699 ***
## DemographicProvincial          -2.69135517   0.62742024  -4.290       0.000018119492 ***
## DemographicRural              -14.17052058   0.60808623 -23.303 < 0.0000000000000002 ***
## percent_indig                  12.91963465   2.76941622   4.665       0.000003136028 ***
## percent_defacto               152.44664157   7.82591302  19.480 < 0.0000000000000002 ***
## percent_born_in_australia     -11.21826628   2.14098299  -5.240       0.000000164993 ***
## percent_rent                  -15.79729218   2.43979124  -6.475       0.000000000101 ***
## median_weekly_rent              0.02988500   0.00276810  10.796 < 0.0000000000000002 ***
## median_age                     -1.12325830   0.04174571 -26.907 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.3 on 7779 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3846, Adjusted R-squared:  0.3838 
## F-statistic: 486.2 on 10 and 7779 DF,  p-value: < 0.00000000000000022
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = alp_2pp_lm_demog) 
## 
##                     Value               p-value                   Decision
## Global Stat        514.92 0.0000000000000000000 Assumptions NOT satisfied!
## Skewness            32.18 0.0000000140185580921 Assumptions NOT satisfied!
## Kurtosis           377.92 0.0000000000000000000 Assumptions NOT satisfied!
## Link Function       65.68 0.0000000000000005551 Assumptions NOT satisfied!
## Heteroscedasticity  39.14 0.0000000003949938154 Assumptions NOT satisfied!

We know that income is important in the non-rural areas, so it might be worth adding an interaction between rural and non-rural and income.

polling_place_2pp_clean <- polling_place_2pp_clean %>%
  mutate(NonRural_Demographic = if_else(Demographic == "Rural", 1, 0),
         ALP_Percent_2013 = ALP_Percent + Swing)

alp_2pp_lm_demog <-
  lm(ALP_Percent ~ median_household_income + Demographic + percent_indig +
       percent_defacto + percent_born_in_australia +
       percent_rent + median_weekly_rent + median_age +
       NonRural_Demographic:median_household_income,
     data = polling_place_2pp_clean)

summary(alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic + 
##     percent_indig + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age + NonRural_Demographic:median_household_income, 
##     data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.412  -8.045  -0.263   7.812  73.408 
## 
## Coefficients:
##                                                  Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)                                  117.94422059   2.62733142  44.891 < 0.0000000000000002 ***
## median_household_income                       -0.00051275   0.00001434 -35.751 < 0.0000000000000002 ***
## DemographicOuter Metropolitan                 -1.91629900   0.45761011  -4.188   0.0000285000653248 ***
## DemographicProvincial                         -4.78416304   0.63940377  -7.482   0.0000000000000811 ***
## DemographicRural                             -35.02665851   1.66019796 -21.098 < 0.0000000000000002 ***
## percent_indig                                 15.73666896   2.74578033   5.731   0.0000000103434805 ***
## percent_defacto                              149.07958037   7.74063935  19.259 < 0.0000000000000002 ***
## percent_born_in_australia                     -8.31193372   2.12750867  -3.907   0.0000942855057573 ***
## percent_rent                                 -13.02413620   2.42071067  -5.380   0.0000000765292289 ***
## median_weekly_rent                             0.03336423   0.00274866  12.138 < 0.0000000000000002 ***
## median_age                                    -0.91972431   0.04394585 -20.929 < 0.0000000000000002 ***
## median_household_income:NonRural_Demographic   0.00029327   0.00002176  13.477 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.16 on 7778 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3987, Adjusted R-squared:  0.3978 
## F-statistic: 468.8 on 11 and 7778 DF,  p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)

vif(alp_2pp_lm_demog)
##                                                   GVIF Df GVIF^(1/(2*Df))
## median_household_income                       4.812683  1        2.193783
## Demographic                                  62.371236  3        1.991425
## percent_indig                                 1.685419  1        1.298237
## percent_defacto                               1.527321  1        1.235848
## percent_born_in_australia                     3.804766  1        1.950581
## percent_rent                                  3.065885  1        1.750967
## median_weekly_rent                            4.865961  1        2.205892
## median_age                                    3.077376  1        1.754245
## median_household_income:NonRural_Demographic 23.061591  1        4.802249
gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
## 
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic + 
##     percent_indig + percent_defacto + percent_born_in_australia + 
##     percent_rent + median_weekly_rent + median_age + NonRural_Demographic:median_household_income, 
##     data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.412  -8.045  -0.263   7.812  73.408 
## 
## Coefficients:
##                                                  Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)                                  117.94422059   2.62733142  44.891 < 0.0000000000000002 ***
## median_household_income                       -0.00051275   0.00001434 -35.751 < 0.0000000000000002 ***
## DemographicOuter Metropolitan                 -1.91629900   0.45761011  -4.188   0.0000285000653248 ***
## DemographicProvincial                         -4.78416304   0.63940377  -7.482   0.0000000000000811 ***
## DemographicRural                             -35.02665851   1.66019796 -21.098 < 0.0000000000000002 ***
## percent_indig                                 15.73666896   2.74578033   5.731   0.0000000103434805 ***
## percent_defacto                              149.07958037   7.74063935  19.259 < 0.0000000000000002 ***
## percent_born_in_australia                     -8.31193372   2.12750867  -3.907   0.0000942855057573 ***
## percent_rent                                 -13.02413620   2.42071067  -5.380   0.0000000765292289 ***
## median_weekly_rent                             0.03336423   0.00274866  12.138 < 0.0000000000000002 ***
## median_age                                    -0.91972431   0.04394585 -20.929 < 0.0000000000000002 ***
## median_household_income:NonRural_Demographic   0.00029327   0.00002176  13.477 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.16 on 7778 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.3987, Adjusted R-squared:  0.3978 
## F-statistic: 468.8 on 11 and 7778 DF,  p-value: < 0.00000000000000022
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = alp_2pp_lm_demog) 
## 
##                     Value      p-value                   Decision
## Global Stat        340.87 0.0000000000 Assumptions NOT satisfied!
## Skewness            25.94 0.0000003526 Assumptions NOT satisfied!
## Kurtosis           277.29 0.0000000000 Assumptions NOT satisfied!
## Link Function       11.62 0.0006526462 Assumptions NOT satisfied!
## Heteroscedasticity  26.03 0.0000003362 Assumptions NOT satisfied!

6.2 Lasso Regression

Using a lasso regression,

polling_place_2pp_clean_na <- na.omit(polling_place_2pp_clean) %>%
  dplyr::select(ALP_Percent, StateAb, median_age, median_household_income,
                average_household_size, persons_per_bedroom, median_weekly_rent,
                median_annual_mortgage,  percent_female, percent_defacto,
                percent_married, percent_indig, percent_born_in_australia,
                percent_unit, percent_mortgage, percent_rent, Demographic)
# Inspect the data
sample_n(polling_place_2pp_clean_na, 3)
# Split the data into training and test set
set.seed(123)
training_samples <- polling_place_2pp_clean_na$ALP_Percent %>%
  createDataPartition(p = 0.8, list = FALSE)
train_data  <- polling_place_2pp_clean_na[training_samples, ]
test_data <- polling_place_2pp_clean_na[-training_samples, ]

# Dummy code categorical predictor variables
x <- model.matrix(ALP_Percent~., train_data)[, -1]
# Convert the outcome (class) to a numerical variable
y <- train_data$ALP_Percent

cv_lasso <- cv.glmnet(x, y, alpha = 1, family = "gaussian")

# Fit the final model on the training data
alp_2pp_lasso_demog <- glmnet(x, y, alpha = 1, family = "gaussian",
                              lambda = cv_lasso$lambda.min)
# Display regression coefficients
coef(alp_2pp_lasso_demog)
## 25 x 1 sparse Matrix of class "dgCMatrix"
##                                           s0
## (Intercept)                    15.0684175489
## StateAbNSW                    -10.5408695179
## StateAbNT                      -7.8046459078
## StateAbQLD                    -16.3022362445
## StateAbSA                     -12.4863080849
## StateAbTAS                     -7.5333433685
## StateAbVIC                    -13.1650588102
## StateAbWA                     -20.0779651971
## median_age                      0.3361809822
## median_household_income        -0.0002954546
## average_household_size          6.4905287857
## persons_per_bedroom            35.6780738148
## median_weekly_rent              0.0139127938
## median_annual_mortgage          0.0001118316
## percent_female                 35.7858791409
## percent_defacto               171.9936601055
## percent_married               -81.0160731481
## percent_indig                 -42.2185144180
## percent_born_in_australia     -23.5404498825
## percent_unit                  -48.7470498762
## percent_mortgage               46.8260925422
## percent_rent                   45.3005289589
## DemographicOuter Metropolitan  -3.1254361561
## DemographicProvincial          -3.2690834206
## DemographicRural              -13.6374572080
summary(alp_2pp_lasso_demog)
##           Length Class     Mode   
## a0         1     -none-    numeric
## beta      24     dgCMatrix S4     
## df         1     -none-    numeric
## dim        2     -none-    numeric
## lambda     1     -none-    numeric
## dev.ratio  1     -none-    numeric
## nulldev    1     -none-    numeric
## npasses    1     -none-    numeric
## jerr       1     -none-    numeric
## offset     1     -none-    logical
## call       6     -none-    call   
## nobs       1     -none-    numeric
alp_2pp_lasso_demog$dev.ratio
## [1] 0.5311051
# Make predictions on the test data
x_test <- model.matrix(ALP_Percent ~., test_data)[, -1]
predictions <- alp_2pp_lasso_demog %>% predict(newx = x_test)
# Model accuracy
observed <- test_data$ALP_Percent

plot(predictions, observed)

cor(predictions, observed)
##         [,1]
## s0 0.7230284
cor(predictions, observed)^2
##         [,1]
## s0 0.5227701
Metrics::rmse(actual = observed,
              predicted = predictions)
## [1] 10.68348

What if we include the 2pp from the last election. It appears that Swing is defined as the swing to the Coalition between 2013 and 2016. So if we add the swing to the 2016 2pp, then we obtain the 2013 2pp.

When using a Linear Model, we can get an R^2 of about 84%

polling_place_2pp_clean <- polling_place_2pp_clean %>%
  mutate(logit_ALP_Percent =
           log((ALP_Percent / 100) / (1 - ALP_Percent / 100)),
         logit_ALP_Percent_2013 =
           log((ALP_Percent_2013 / 100) / (1 - ALP_Percent_2013 / 100))) %>%
  filter(!is.nan(logit_ALP_Percent)) %>%
  filter(!is.na(logit_ALP_Percent)) %>%
  filter(!is.infinite(logit_ALP_Percent)) %>%
  filter(!is.nan(logit_ALP_Percent_2013)) %>%
  filter(!is.na(logit_ALP_Percent_2013)) %>%
  filter(!is.infinite(logit_ALP_Percent_2013))

alp_2pp_lm_demog_lag <-
  lm(logit_ALP_Percent ~ percent_indig + percent_defacto +
       percent_rent + median_weekly_rent + median_age +
       NonRural_Demographic * median_household_income + logit_ALP_Percent_2013,
     data = polling_place_2pp_clean)

summary(alp_2pp_lm_demog_lag)
## 
## Call:
## lm(formula = logit_ALP_Percent ~ percent_indig + percent_defacto + 
##     percent_rent + median_weekly_rent + median_age + NonRural_Demographic * 
##     median_household_income + logit_ALP_Percent_2013, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1110 -0.1297 -0.0042  0.1305  3.1633 
## 
## Coefficients:
##                                                   Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)                                   1.0959091994  0.0517826740  21.164 < 0.0000000000000002 ***
## percent_indig                                 0.4115344175  0.0603756158   6.816 0.000000000010048187 ***
## percent_defacto                               0.5801063962  0.1692442371   3.428             0.000612 ***
## percent_rent                                 -0.3883763689  0.0480686113  -8.080 0.000000000000000748 ***
## median_weekly_rent                            0.0002499617  0.0000599165   4.172 0.000030546066445595 ***
## median_age                                   -0.0147752213  0.0009830689 -15.030 < 0.0000000000000002 ***
## NonRural_Demographic                         -0.2116056651  0.0346441295  -6.108 0.000000001057798155 ***
## median_household_income                      -0.0000051779  0.0000003219 -16.086 < 0.0000000000000002 ***
## logit_ALP_Percent_2013                        0.8974069831  0.0060571128 148.158 < 0.0000000000000002 ***
## NonRural_Demographic:median_household_income  0.0000020939  0.0000004902   4.272 0.000019646216040251 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2802 on 7733 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.8392, Adjusted R-squared:  0.839 
## F-statistic:  4485 on 9 and 7733 DF,  p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog_lag)
plot(alp_2pp_lm_demog_lag)

vif(alp_2pp_lm_demog_lag)
##                                percent_indig                              percent_defacto                                 percent_rent                           median_weekly_rent                                   median_age                         NonRural_Demographic                      median_household_income                       logit_ALP_Percent_2013 NonRural_Demographic:median_household_income 
##                                     1.502748                                     1.366026                                     2.262058                                     4.328613                                     2.884030                                    27.998884                                     4.532365                                     1.598839                                    21.897932
gv_alp_2pp_lm_demog_lag <- gvlma(alp_2pp_lm_demog_lag)
summary(alp_2pp_lm_demog_lag)
## 
## Call:
## lm(formula = logit_ALP_Percent ~ percent_indig + percent_defacto + 
##     percent_rent + median_weekly_rent + median_age + NonRural_Demographic * 
##     median_household_income + logit_ALP_Percent_2013, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1110 -0.1297 -0.0042  0.1305  3.1633 
## 
## Coefficients:
##                                                   Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)                                   1.0959091994  0.0517826740  21.164 < 0.0000000000000002 ***
## percent_indig                                 0.4115344175  0.0603756158   6.816 0.000000000010048187 ***
## percent_defacto                               0.5801063962  0.1692442371   3.428             0.000612 ***
## percent_rent                                 -0.3883763689  0.0480686113  -8.080 0.000000000000000748 ***
## median_weekly_rent                            0.0002499617  0.0000599165   4.172 0.000030546066445595 ***
## median_age                                   -0.0147752213  0.0009830689 -15.030 < 0.0000000000000002 ***
## NonRural_Demographic                         -0.2116056651  0.0346441295  -6.108 0.000000001057798155 ***
## median_household_income                      -0.0000051779  0.0000003219 -16.086 < 0.0000000000000002 ***
## logit_ALP_Percent_2013                        0.8974069831  0.0060571128 148.158 < 0.0000000000000002 ***
## NonRural_Demographic:median_household_income  0.0000020939  0.0000004902   4.272 0.000019646216040251 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2802 on 7733 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.8392, Adjusted R-squared:  0.839 
## F-statistic:  4485 on 9 and 7733 DF,  p-value: < 0.00000000000000022

It would be interesting to see whether 2013 alone is a good predictor. Fitting this model gives an R^2 of 82.7%. This means that the other variables add a bit, but not a huge amount.

alp_2pp_lm_demog_lag <-
  lm(logit_ALP_Percent ~ logit_ALP_Percent_2013,
     data = polling_place_2pp_clean)

summary(alp_2pp_lm_demog_lag)
## 
## Call:
## lm(formula = logit_ALP_Percent ~ logit_ALP_Percent_2013, data = polling_place_2pp_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2357 -0.1441 -0.0115  0.1335  3.2092 
## 
## Coefficients:
##                        Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)            0.127440   0.003472    36.7 <0.0000000000000002 ***
## logit_ALP_Percent_2013 0.954281   0.004944   193.0 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2902 on 7793 degrees of freedom
## Multiple R-squared:  0.827,  Adjusted R-squared:  0.827 
## F-statistic: 3.725e+04 on 1 and 7793 DF,  p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog_lag)
plot(alp_2pp_lm_demog_lag)

There are three outlying values - let"s explore these

polling_place_2pp_clean[c(2079, 7267, 2758), ]

These booths have fewer than 40 electors, and fewer than 5 for one of the two parties. They are also “non-standard” booths.

It might be interesting to see which booths deviate from the division mean.

6.3 Trying GBM

Can we fit a gradient boosting machine to improve fit (while trading off raw explainability)

polling_place_2pp_gbm <- polling_place_2pp_clean %>%
  clean_names() %>%
  dplyr::select(-year, -division_nm, -polling_place_id, -polling_place,
                -division, -premises_post_code, -division_id, -state,
                -enrolment, -successful_party, -seat_status, -alp_percent,
                -lnp_percent, -lnp_votes, -alp_votes, -total_votes, -swing) %>%
  mutate(state_ab = factor(state_ab),
         demographic = factor(demographic),
         previous_party = factor(previous_party),
         previous_seat_status = factor(previous_seat_status))


polling_place_2pp_split <- initial_split(polling_place_2pp_gbm, prop = .8)
polling_place_2pp_train <- training(polling_place_2pp_split)
polling_place_2pp_test  <- testing(polling_place_2pp_split)


polling_place_2pp_gbm1 <- gbm::gbm(logit_alp_percent ~.,
                                   data = polling_place_2pp_train,
                                   distribution = "gaussian",
                                   verbose = F,
                                   shrinkage = 0.1,
                                   interaction.depth = 2,
                                   n.minobsinnode = 10,
                                   n.trees = 5000,
                                   cv.folds = 10)

perf_gbm1 <- gbm.perf(polling_place_2pp_gbm1, method = "cv")

polling_place_2pp_prediction_1 <- stats::predict(
  object = polling_place_2pp_gbm1,
  newdata = polling_place_2pp_test,
  n.trees = perf_gbm1)
rmse_fit1 <- Metrics::rmse(actual = polling_place_2pp_test$logit_alp_percent,
                           predicted = polling_place_2pp_prediction_1)
print(rmse_fit1)
## [1] 0.2518863
data.frame(actual = polling_place_2pp_test$logit_alp_percent,
           predicted = polling_place_2pp_prediction_1) %>%
  ggplot(aes(x = predicted, y = actual)) + geom_point() +
  geom_abline(slope = 1, intercept = 0, colour = "blue") +
  theme_classic()

gbm::plot.gbm(polling_place_2pp_gbm1, i.var = 8)

When looking at variable importance, it is clear that the most important factor is the 2pp vote in the previous election.

polling_place_2pp_effects <-
  tibble::as_tibble(gbm::summary.gbm(polling_place_2pp_gbm1, plotit = FALSE))
polling_place_2pp_effects %>% utils::head(n = 15)
polling_place_2pp_effects %>%
  # arrange descending to get the top influencers
  dplyr::arrange(desc(rel.inf)) %>%
  # sort to top 10
  dplyr::top_n(10) %>%
  # plot these data using columns
  ggplot(aes(x = forcats::fct_reorder(.f = var,
                                      .x = rel.inf),
             y = rel.inf,
             fill = rel.inf)) +
  geom_col() +
  # flip
  coord_flip() +
  # format
  scale_color_brewer(palette = "Dark2") +
  theme(axis.title = element_text()) +
  theme_classic() +
  xlab("Features") +
  ylab("Relative Influence") +
  ggtitle("Top 10 Drivers of 2pp vote")

6.3.1 Using caret functions

6.4 Outlying booths